Define heatmap function

library(pheatmap)

create_gene_heatmap <- function(de_results_df, dge_object, font_size = 12, num_genes = 20) {
  # Subset the genes
  top_genes <- de_results_df$Symbol[1:num_genes]
  subset_matrix <- dge_object$logCPM[top_genes, colnames(dge_object$counts)]
  
  # Create annotation dataframe
  sample_info <- dge_object$samples
  annotation_df <- data.frame(
    Donor = sample_info$Donor,
    Subset = sample_info$Subset
  )
  rownames(annotation_df) <- colnames(subset_matrix)
  
  # Define custom colors for annotations
  annotation_colors <- list(
    Subset = c(CD4 = "darkgreen", CD8 = "purple"),  # Custom distinct colors for Subset
    Donor = c(Donor_47 = "orange", Donor_64 = "blue")  # Custom distinct colors for Donor
  )
  
  # Reorder gene expression matrix
  ordered_samples <- order(sample_info$Subset, sample_info$Donor)
  subset_matrix <- subset_matrix[, ordered_samples]
  sample_info <- sample_info[ordered_samples, ]
  
  # Define custom color scheme for blue-white-red
  custom_colors <- colorRampPalette(c("blue", "white", "red"))(50)
  
  # Create the heatmap
  heatmap <- pheatmap(
    subset_matrix,
    scale = "row",
    cluster_rows = TRUE,
    cluster_cols = FALSE,
    treeheight_row = 0,
    color = custom_colors,  # Use blue-white-red color scheme
    show_colnames = FALSE,
    annotation_col = annotation_df,
    annotation_colors = annotation_colors,  # Add custom annotation colors
    fontsize = font_size,
    silent = FALSE
  )
}

TIRE-seq T cell human basic DE subsets

The aim of this notebook is to identify genes that differ between CD4 and CD8 subsets over time. This will be a basic DE analysis informed by the time course centric analysis conducted in notebook 3A splines timecourse.

Recap

I sequenced these samples with Prime-seq on extracted RNAs already. Results were not that great. Here reprocess the same samples with TIRE-seq for a head to head comparison.

Samples

  • Yasmin Nouri plates.
  • All wells have 10,000 cells in 50ul total volume.
  • Comprised of 25ul cells in media and 25ul 2x buffer TCL
  • The cell concentration is therefore 200 cells/uL.
  • So 20uL input will be 4000 cells.

Lab notes

The processing went as planned. Full writeup available at ELN link

Read data

This was generated in 2A_clustering

sce <- readRDS(here::here(
     "data/TIRE_Tcell/SCEs/tcell_cluster.sce.rds"
))
# Reorder factor levels
sce$Timepoint <- factor(sce$Timepoint, 
                            levels = c("Day_0", "Day_1", "Day_2",
                                       "Day_5", "Day_6", "Day_8",
                                       "Day_9", "Day_13", "Day_15"))

dge <- scran::convertTo(sce, type="edgeR")
tb <- as_tibble(dge$samples)

Check on the perturbations conducted in this experiment:

tb %>% 
  dplyr::count(Subset, Donor, Timepoint) %>% 
  arrange(Timepoint) %>% 
  head(10)

Filter for genes that have at least 5 counts.
Currently keep 8,249

keep <- filterByExpr(dge, group=dge$samples$Timepoint, min.count=5)
dge <- dge[keep,]
summary(keep)
##    Mode   FALSE    TRUE 
## logical   12666    8249

Correct for composition biases by computing normalization factors with the trimmed mean of M-values method.

dge <- calcNormFactors(dge)

Basic DE analysis

Combine time and subset into a new factor. For an interaction analysis it is easier to be more explicit than having fancy multiplication terms i.e timepoint * subset

The intercept term in a design matrix represents the baseline or reference level in your experimental design. Specifically:

  • When an intercept is included, it typically corresponds to the mean expression level of the reference group or condition.
  • Other coefficients in the model are then interpreted as differences from this baseline.
  • Have “0” in the model matrix means the intercept regressed out
  • When you include an intercept term in the model matrix, it adds a column of 1s to the design matrix. This column of 1s allows the regression line to intersect the y-axis at a non-zero point.
    • Removing the intercept term (by using model.matrix(~0 + .)) forces the regression line to go through the origin (y-axis at 0).
  • Including an intercept makes good sense when the first group represents a reference or control group, as all comparison are made with respect to this condition.
  • In the case of this experiment, removing the intercept means I can make a later contrast matrix that explicitly lists the comparisons of interest.
dge$samples$Subset_Time <- paste(dge$samples$Subset, dge$samples$Timepoint, sep="-")

sm <- model.matrix(~0+Subset_Time + Donor, data=dge$samples)
# hypens not allowed
colnames(sm) <- make.names(colnames(sm), unique = FALSE, allow_ = TRUE)

Look at some highly expressed genes

Convert the day from a factor to a numeric

sce$TimeDay <- str_split(sce$Timepoint, pattern = "_", simplify = T)[,2]
sce$TimeDay <- factor(sce$TimeDay, levels = sort(as.numeric(unique(sce$TimeDay))))
interest <- c("IL2", "IFNG", "CDK1", "MKI67")
sce <- logNormCounts(sce)

plt0 <- plotExpression(sce, features = interest, x = "TimeDay", assay.type = "logcounts",
                       colour_by = "Subset", exprs_values = "counts", ncol=4) + 
  theme_Publication() + xlab("Day") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

plt0

Differential expression analysis

Decide on the contrasts.

We focus on the early timepoints which is where all the changes take place.

contr.matrix <- makeContrasts(
   Day0 = Subset_TimeCD8.Day_0 - Subset_TimeCD4.Day_0,
   Day1 = Subset_TimeCD8.Day_1 - Subset_TimeCD4.Day_1,
   Day2 = Subset_TimeCD8.Day_2 - Subset_TimeCD4.Day_2,
   Day5 = Subset_TimeCD8.Day_5 - Subset_TimeCD4.Day_5,
   Day15 = Subset_TimeCD8.Day_15 - Subset_TimeCD4.Day_15,
   levels = colnames(sm))

contr.matrix %>% 
  kable()
Day0 Day1 Day2 Day5 Day15
Subset_TimeCD4.Day_0 -1 0 0 0 0
Subset_TimeCD4.Day_1 0 -1 0 0 0
Subset_TimeCD4.Day_13 0 0 0 0 0
Subset_TimeCD4.Day_15 0 0 0 0 -1
Subset_TimeCD4.Day_2 0 0 -1 0 0
Subset_TimeCD4.Day_5 0 0 0 -1 0
Subset_TimeCD4.Day_6 0 0 0 0 0
Subset_TimeCD4.Day_8 0 0 0 0 0
Subset_TimeCD8.Day_0 1 0 0 0 0
Subset_TimeCD8.Day_1 0 1 0 0 0
Subset_TimeCD8.Day_13 0 0 0 0 0
Subset_TimeCD8.Day_15 0 0 0 0 1
Subset_TimeCD8.Day_2 0 0 1 0 0
Subset_TimeCD8.Day_5 0 0 0 1 0
Subset_TimeCD8.Day_6 0 0 0 0 0
Subset_TimeCD8.Day_8 0 0 0 0 0
DonorDonor_64 0 0 0 0 0

In each contrast, the format is A - B where:

  • A represents the condition considered as the “treatment” or point of interest
  • B represents the condition considered as the “control” or baseline

So in this analysis the positive log fold changes will be CD8 cells.

Fit this model using limma/ voom.

par(mfrow=c(1,2))
v <- voom(dge, sm, plot=TRUE)

vfit <- lmFit(v, sm)
vfit <- contrasts.fit(vfit, contrasts=contr.matrix)
efit <- eBayes(vfit)
plotSA(efit, main="Final model: Mean-variance trend")

Perform the differential expression analysis.

Check how many genes are differentially expressed

summary(decideTests(efit))
##        Day0 Day1 Day2 Day5 Day15
## Down     70  114  394   35   121
## NotSig 8043 7909 7375 8172  8025
## Up      136  226  480   42   103

Visualise DE genes

Day 1

CD8 is highest which is a good sanity check.

Day1_cd8.vs.cd4 <- topTable(efit, coef=2, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day1_cd8.vs.cd4)
results$ID <- rownames(Day1_cd8.vs.cd4)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"
results$genelabels[results$Symbol == "FCGR3A"] <- "FCGR3A"

results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "TSHZ2"] <- "TSHZ2"
results$genelabels[results$Symbol == "FBLN7"] <- "FBLN7"
  • XCL1 is an inflammatory chemokine that is mainly secreted by activated CD8+ T cells and its biological function is the subject of renewed scrutiny.
    • The earliest reports described XCL1 as a mediator of T cell and NK cell chemotaxis
    • However, recent studies suggest a highly specialized role of the XCL1-XCR1 signaling axis in the mediation of interactions between antigen-presenting dendritic cells and T-cells.

Generate volcano plot

results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4"))  # Adjust as per your actual labels

plt1 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 0.33, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkorange", "grey", "darkblue"),  # Colors in the desired order
    name = "Upregulated",
    labels = c("CD8", "n/s", "CD4")  # Optional: Add custom labels
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()


plt1

Generate heatmap

dge$logCPM <- cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Timepoint == "Day_1"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=15)

Gene set testing with camera

Need to convert geneIDs from ensembl to enterez

geneids <- as.data.frame(v$genes$ID)
colnames(geneids) <- "ENSEMBL"

geneids$entrez <- mapIds(org.Hs.eg.db, keys = geneids$ENSEMBL, keytype = "ENSEMBL", column = "ENTREZID")

load("data/MSigDB/human_H_v5p2.rdata")
idx <- ids2indices(Hs.H,identifiers = geneids$entrez)

Myc targets up in CD4 and complement up in CD8

cam.day1 <- camera(v,idx,sm,contrast=contr.matrix[,2])
head(cam.day1,10)

Day 2

Day 2 is where most of the changes take place and the T cells are taking off after being stimulated.

Day2_cd8.vs.cd4 <- topTable(efit, coef=3, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day2_cd8.vs.cd4)
results$ID <- rownames(Day2_cd8.vs.cd4)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "CD8B2"] <- "CD8B2"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"

results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "TSHZ2"] <- "TSHZ2"
results$genelabels[results$Symbol == "FBLN7"] <- "FBLN7"
results$genelabels[results$Symbol == "CTSL"] <- "CTSL"

Generate volcano plot

# Reorder the levels of DElabel for proper legend ordering
results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4"))  # Ensure correct order

plt2 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 0.33, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkorange", "grey", "darkblue"),  # Colors in desired order
    name = "Upregulated",                         # Legend title
    labels = c("CD8", "n/s", "CD4")               # Correct legend labels
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()

plt2

Generate heatmap

dge$logCPM <- cpm(dge, log=TRUE, prior.count=1)
dge_subset <- dge[,dge$samples$Timepoint == "Day_2"]
create_gene_heatmap(de_results_df = results, dge_object = dge_subset, font_size = 12, num_genes=15)

Gene set testing with camera

At day 2:

  • Proliferation higher in CD8
  • Inferon / cytokine signalling up in CD4
cam.day2 <- camera(v,idx,sm,contrast=contr.matrix[,3])
head(cam.day2,10)

Visualize as a barplot

geom_GeneSet_Barchart(cam.day2)

Day 15

Some chemokines here but nothing interesting

Day15_cd8.vs.cd4 <- topTable(efit, coef=5, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day15_cd8.vs.cd4)
results$ID <- rownames(Day15_cd8.vs.cd4)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "CD8B2"] <- "CD8B2"
results$genelabels[results$Symbol == "ZNF683"] <- "ZNF683"

results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "CCL1"] <- "CCL1"
results$genelabels[results$Symbol == "GZMB"] <- "GZMB"
results$genelabels[results$Symbol == "XCL1"] <- "XCL1"

Generate volcano plot

results$DElabel <- factor(results$DElabel, levels = c("CD8", "n/s", "CD4"))  # Ensure correct order

plt3 <- ggplot(data = results, aes(x = logFC, y = -log10(adj.P.Val), colour = DElabel, label = genelabels)) + 
  geom_point(alpha = 0.33, size = 1.5) +
  geom_text_repel(size = 4, colour = "black") +
  guides(colour = guide_legend(override.aes = list(size = 3, alpha = 1))) +
  scale_color_manual(
    values = c("darkorange", "grey", "darkblue"),  # Colors in desired order
    name = "Upregulated",                         # Legend title
    labels = c("CD8", "n/s", "CD4")               # Correct legend labels
  ) +
  geom_vline(xintercept = 1, linetype = "dotted") + 
  geom_vline(xintercept = -1, linetype = "dotted") +
  theme_Publication()

plt3

Day 5

Some chemokines here but nothing interesting

Day5_cd8.vs.cd4 <- topTable(efit, coef=4, n=length(efit$genes$ID), sort.by = "logFC")
results <- as_tibble(Day5_cd8.vs.cd4)
results$ID <- rownames(Day5_cd8.vs.cd4)

# add a column of NAs
results$DElabel <- "n/s"
# if log2Foldchange > 1 or < -1 and pvalue < 0.05, set as "UP"
results$DElabel[results$logFC > 1 & results$adj.P.Val < 0.1] <- "CD8"
results$DElabel[results$logFC < -1 & results$adj.P.Val < 0.1] <- "CD4"

Add gene labels

results$genelabels <- ""
results$genelabels[results$Symbol == "CD8B"] <- "CD8B"
results$genelabels[results$Symbol == "CD8A"] <- "CD8A"
results$genelabels[results$Symbol == "LINC02446"] <- "LINC02446"
results$genelabels[results$Symbol == "KLRD1"] <- "KLRD1"

results$genelabels[results$Symbol == "CD4"] <- "CD4"
results$genelabels[results$Symbol == "CD40LG"] <- "CD40LG"
results$genelabels[results$Symbol == "TIMP1"] <- "TIMP1"
results$genelabels[results$Symbol == "FHIT"] <- "FHIT"

Generate volcano plot

plt4 <- ggplot(data=results, aes(x=logFC, y=-log10(adj.P.Val), colour=DElabel, label=genelabels)) + 
  geom_point(alpha=0.33, size=1.5) +
  geom_text_repel(size=4, colour="black", max.overlaps = 30) +
  guides(colour = guide_legend(override.aes = list(size=3, alpha=1))) +
  scale_color_manual(values = c("darkblue", "darkorange", "grey"), name = "Upregulated") +
  geom_vline(xintercept = 1, linetype="dotted") + 
  geom_vline(xintercept = -1, linetype="dotted") +
  theme_Publication()

plt4

Conclusion

Genes are all expected. Good to see CD4 and CD8 as most DE genes.

Session info

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Red Hat Enterprise Linux 9.4 (Plow)
## 
## Matrix products: default
## BLAS:   /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRblas.so 
## LAPACK: /stornext/System/data/software/rhel/9/base/tools/R/4.4.1/lib64/R/lib/libRlapack.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Australia/Melbourne
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] grid      splines   stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] ggthemes_5.1.0              here_1.0.1                 
##  [3] knitr_1.48                  patchwork_1.3.0            
##  [5] org.Hs.eg.db_3.19.1         AnnotationDbi_1.66.0       
##  [7] ggrepel_0.9.6               viridis_0.6.5              
##  [9] viridisLite_0.4.2           pheatmap_1.0.12            
## [11] edgeR_4.2.2                 limma_3.60.6               
## [13] scater_1.32.1               scuttle_1.14.0             
## [15] lubridate_1.9.3             forcats_1.0.0              
## [17] stringr_1.5.1               dplyr_1.1.4                
## [19] purrr_1.0.2                 readr_2.1.5                
## [21] tidyr_1.3.1                 tibble_3.2.1               
## [23] ggplot2_3.5.1               tidyverse_2.0.0            
## [25] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [27] Biobase_2.64.0              GenomicRanges_1.56.2       
## [29] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [31] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [33] MatrixGenerics_1.16.0       matrixStats_1.4.1          
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3                 gridExtra_2.3            
##  [3] rlang_1.1.4               magrittr_2.0.3           
##  [5] compiler_4.4.1            RSQLite_2.3.7            
##  [7] DelayedMatrixStats_1.26.0 png_0.1-8                
##  [9] vctrs_0.6.5               pkgconfig_2.0.3          
## [11] crayon_1.5.3              fastmap_1.2.0            
## [13] XVector_0.44.0            labeling_0.4.3           
## [15] utf8_1.2.4                rmarkdown_2.28           
## [17] tzdb_0.4.0                UCSC.utils_1.0.0         
## [19] ggbeeswarm_0.7.2          bit_4.5.0                
## [21] bluster_1.14.0            xfun_0.48                
## [23] zlibbioc_1.50.0           cachem_1.1.0             
## [25] beachmat_2.20.0           jsonlite_1.8.9           
## [27] blob_1.2.4                highr_0.11               
## [29] DelayedArray_0.30.1       BiocParallel_1.38.0      
## [31] cluster_2.1.6             irlba_2.3.5.1            
## [33] parallel_4.4.1            R6_2.5.1                 
## [35] bslib_0.8.0               stringi_1.8.4            
## [37] RColorBrewer_1.1-3        jquerylib_0.1.4          
## [39] Rcpp_1.0.13               igraph_2.0.3             
## [41] Matrix_1.7-0              timechange_0.3.0         
## [43] tidyselect_1.2.1          rstudioapi_0.17.0        
## [45] abind_1.4-8               yaml_2.3.10              
## [47] codetools_0.2-20          lattice_0.22-6           
## [49] KEGGREST_1.44.1           withr_3.0.1              
## [51] evaluate_1.0.1            Biostrings_2.72.1        
## [53] pillar_1.9.0              generics_0.1.3           
## [55] rprojroot_2.0.4           hms_1.1.3                
## [57] sparseMatrixStats_1.16.0  munsell_0.5.1            
## [59] scales_1.3.0              glue_1.8.0               
## [61] metapod_1.12.0            tools_4.4.1              
## [63] BiocNeighbors_1.22.0      ScaledMatrix_1.12.0      
## [65] locfit_1.5-9.10           scran_1.32.0             
## [67] cowplot_1.1.3             colorspace_2.1-1         
## [69] GenomeInfoDbData_1.2.12   beeswarm_0.4.0           
## [71] BiocSingular_1.20.0       vipor_0.4.7              
## [73] cli_3.6.3                 rsvd_1.0.5               
## [75] fansi_1.0.6               S4Arrays_1.4.1           
## [77] gtable_0.3.5              sass_0.4.9               
## [79] digest_0.6.37             dqrng_0.4.1              
## [81] SparseArray_1.4.8         farver_2.1.2             
## [83] memoise_2.0.1             htmltools_0.5.8.1        
## [85] lifecycle_1.0.4           httr_1.4.7               
## [87] statmod_1.5.0             bit64_4.5.2